import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('bmh')
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%config InlineBackend.figure_format='retina'
bank_df = pd.read_excel(r'C:\Data\Banking Data.xlsx')
bank_df
| banking marketing | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | Unnamed: 10 | Unnamed: 11 | Unnamed: 12 | Unnamed: 13 | Unnamed: 14 | Unnamed: 15 | Unnamed: 16 | Unnamed: 17 | Unnamed: 18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer id and age. | NaN | Customer salary and balance. | NaN | Customer marital status and job with education... | NaN | particular customer before targeted or not | NaN | Loan types: loans or housing loans | NaN | Contact type | NaN | month of contact | duration of call | NaN | NaN | NaN | outcome of previous contact | response of customer after call happned |
| 1 | customerid | age | salary | balance | marital | jobedu | targeted | default | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | response |
| 2 | 1 | 58 | 100000 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 5 | may, 2017 | 261 sec | 1 | -1 | 0 | unknown | no |
| 3 | 2 | 44 | 60000 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 5 | may, 2017 | 151 sec | 1 | -1 | 0 | unknown | no |
| 4 | 3 | 33 | 120000 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 5 | may, 2017 | 76 sec | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45208 | 45207 | 51 | 60000 | 825 | married | technician,tertiary | yes | no | no | no | cellular | 17 | nov, 2017 | 16.2833333333333 min | 3 | -1 | 0 | unknown | yes |
| 45209 | 45208 | 71 | 55000 | 1729 | divorced | retired,primary | yes | no | no | no | cellular | 17 | nov, 2017 | 7.6 min | 2 | -1 | 0 | unknown | yes |
| 45210 | 45209 | 72 | 55000 | 5715 | married | retired,secondary | yes | no | no | no | cellular | 17 | nov, 2017 | 18.7833333333333 min | 5 | 184 | 3 | success | yes |
| 45211 | 45210 | 57 | 20000 | 668 | married | blue-collar,secondary | yes | no | no | no | telephone | 17 | nov, 2017 | 8.46666666666667 min | 4 | -1 | 0 | unknown | no |
| 45212 | 45211 | 37 | 120000 | 2971 | married | entrepreneur,secondary | yes | no | no | no | cellular | 17 | nov, 2017 | 6.01666666666667 min | 2 | 188 | 11 | other | no |
45213 rows × 19 columns
bank_df.columns
Index(['banking marketing', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3',
'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8',
'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12',
'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16',
'Unnamed: 17', 'Unnamed: 18'],
dtype='object')
list(bank_df[1:2].values)
[array(['customerid', 'age', 'salary', 'balance', 'marital', 'jobedu',
'targeted', 'default', 'housing', 'loan', 'contact', 'day',
'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
'response'], dtype=object)]
bank_df.columns = ('customer_id', 'customer_age', 'salary', 'balance', 'marital', 'job_edu',
'targeted_before_or_not', 'default', 'housing_loan', 'loan', 'contact', 'contact_day',
'contact_month', 'durationof_call', 'campaign', 'pdays', 'previous', 'prv_outcome',
'response_after_call')
bank_df.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | contact_day | contact_month | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | customer id and age. | NaN | Customer salary and balance. | NaN | Customer marital status and job with education... | NaN | particular customer before targeted or not | NaN | Loan types: loans or housing loans | NaN | Contact type | NaN | month of contact | duration of call | NaN | NaN | NaN | outcome of previous contact | response of customer after call happned |
| 1 | customerid | age | salary | balance | marital | jobedu | targeted | default | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | response |
| 2 | 1 | 58 | 100000 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 5 | may, 2017 | 261 sec | 1 | -1 | 0 | unknown | no |
| 3 | 2 | 44 | 60000 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 5 | may, 2017 | 151 sec | 1 | -1 | 0 | unknown | no |
| 4 | 3 | 33 | 120000 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 5 | may, 2017 | 76 sec | 1 | -1 | 0 | unknown | no |
bank_df=bank_df.drop([0,1])
bank_df.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | contact_day | contact_month | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 58 | 100000 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 5 | may, 2017 | 261 sec | 1 | -1 | 0 | unknown | no |
| 3 | 2 | 44 | 60000 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 5 | may, 2017 | 151 sec | 1 | -1 | 0 | unknown | no |
| 4 | 3 | 33 | 120000 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 5 | may, 2017 | 76 sec | 1 | -1 | 0 | unknown | no |
| 5 | 4 | 47 | 20000 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 5 | may, 2017 | 92 sec | 1 | -1 | 0 | unknown | no |
| 6 | 5 | 33 | 0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 5 | may, 2017 | 198 sec | 1 | -1 | 0 | unknown | no |
bank_df.reset_index(drop=True,inplace=True)
bank_df.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | contact_day | contact_month | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 58 | 100000 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 5 | may, 2017 | 261 sec | 1 | -1 | 0 | unknown | no |
| 1 | 2 | 44 | 60000 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 5 | may, 2017 | 151 sec | 1 | -1 | 0 | unknown | no |
| 2 | 3 | 33 | 120000 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 5 | may, 2017 | 76 sec | 1 | -1 | 0 | unknown | no |
| 3 | 4 | 47 | 20000 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 5 | may, 2017 | 92 sec | 1 | -1 | 0 | unknown | no |
| 4 | 5 | 33 | 0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 5 | may, 2017 | 198 sec | 1 | -1 | 0 | unknown | no |
bank_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45211 entries, 0 to 45210 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 45211 non-null object 1 customer_age 45191 non-null object 2 salary 45211 non-null object 3 balance 45211 non-null object 4 marital 45211 non-null object 5 job_edu 45211 non-null object 6 targeted_before_or_not 45211 non-null object 7 default 45211 non-null object 8 housing_loan 45211 non-null object 9 loan 45211 non-null object 10 contact 45211 non-null object 11 contact_day 45211 non-null object 12 contact_month 45161 non-null object 13 durationof_call 45211 non-null object 14 campaign 45211 non-null object 15 pdays 45211 non-null object 16 previous 45211 non-null object 17 prv_outcome 45211 non-null object 18 response_after_call 45181 non-null object dtypes: object(19) memory usage: 6.6+ MB
bank_df.customer_id=bank_df.customer_id.astype(int)
bank_df.customer_age=bank_df.customer_age.astype(float)
bank_df.balance = bank_df.balance.astype(int)
bank_df.contact_day = bank_df.contact_day.astype(int)
bank_df.campaign = bank_df.campaign.astype(int)
bank_df.pdays = bank_df.pdays.astype(int)
bank_df.previous = bank_df.previous.astype(int)
bank_df.salary=bank_df['salary'].replace('?',np.nan)
bank_df.salary = bank_df.salary.astype(float)
bank_df.isna().sum()
customer_id 0 customer_age 20 salary 62 balance 0 marital 0 job_edu 0 targeted_before_or_not 0 default 0 housing_loan 0 loan 0 contact 0 contact_day 0 contact_month 50 durationof_call 0 campaign 0 pdays 0 previous 0 prv_outcome 0 response_after_call 30 dtype: int64
# Dealing with salary cloumn to fill null values
bank_df.salary.mean()
bank_df.salary.median()
bank_df.salary.plot(kind='box')
# Because of no outliers we can fill null values with mean
bank_df.salary.replace(np.nan,bank_df.salary.mean(),inplace=True)
57084.45369775632
60000.0
<AxesSubplot:>
bank_df.isna().sum()
customer_id 0 customer_age 20 salary 0 balance 0 marital 0 job_edu 0 targeted_before_or_not 0 default 0 housing_loan 0 loan 0 contact 0 contact_day 0 contact_month 50 durationof_call 0 campaign 0 pdays 0 previous 0 prv_outcome 0 response_after_call 30 dtype: int64
#1) Dealing with null values of customer column age
bank_df.customer_age.mean()
bank_df.customer_age.median()
bank_df.customer_age=bank_df.customer_age.fillna(bank_df.customer_age.mean())
# replacing null values with mean of customer age
40.93565090394105
39.0
bank_df.customer_age.isna().sum()
0
#2) dealing with customer contact month column
# Filling the null values with interpolate
bank_df.contact_month.interpolate(method='ffill',limit_direction='forward',inplace=True)
bank_df.contact_month.isna().sum()
0
bank_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45211 entries, 0 to 45210 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 45211 non-null int32 1 customer_age 45211 non-null float64 2 salary 45211 non-null float64 3 balance 45211 non-null int32 4 marital 45211 non-null object 5 job_edu 45211 non-null object 6 targeted_before_or_not 45211 non-null object 7 default 45211 non-null object 8 housing_loan 45211 non-null object 9 loan 45211 non-null object 10 contact 45211 non-null object 11 contact_day 45211 non-null int32 12 contact_month 45211 non-null object 13 durationof_call 45211 non-null object 14 campaign 45211 non-null int32 15 pdays 45211 non-null int32 16 previous 45211 non-null int32 17 prv_outcome 45211 non-null object 18 response_after_call 45181 non-null object dtypes: float64(2), int32(6), object(11) memory usage: 5.5+ MB
#3) dealing with null values of (response after call) column
bank_df.response_after_call.astype(str).mode()
0 no Name: response_after_call, dtype: object
bank_df.response_after_call.interpolate(method='ffill',limit_direction='forward',inplace=True)
# Merging day column with month column of dataframe
bank_df['contact_Date'] = bank_df['contact_day'].astype(str)+bank_df['contact_month']
bank_df.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | contact_day | contact_month | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | contact_Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 58.0 | 100000.0 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 5 | may, 2017 | 261 sec | 1 | -1 | 0 | unknown | no | 5may, 2017 |
| 1 | 2 | 44.0 | 60000.0 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 5 | may, 2017 | 151 sec | 1 | -1 | 0 | unknown | no | 5may, 2017 |
| 2 | 3 | 33.0 | 120000.0 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 5 | may, 2017 | 76 sec | 1 | -1 | 0 | unknown | no | 5may, 2017 |
| 3 | 4 | 47.0 | 20000.0 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 5 | may, 2017 | 92 sec | 1 | -1 | 0 | unknown | no | 5may, 2017 |
| 4 | 5 | 33.0 | 0.0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 5 | may, 2017 | 198 sec | 1 | -1 | 0 | unknown | no | 5may, 2017 |
# Changing contact_date cloumn to dateformat
from datetime import datetime
bank_df['Con_Date'] = pd.to_datetime(bank_df.contact_Date,format='%d%b, %Y')
# Droping unrequired cloumn of date
bank_df.drop(['contact_day','contact_month','contact_Date'],axis=1,inplace=True)
bank_df.to_csv(r'C:\Data\Banking_cleaned_Data.csv')
bank_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45211 entries, 0 to 45210 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 45211 non-null int32 1 customer_age 45211 non-null float64 2 salary 45211 non-null float64 3 balance 45211 non-null int32 4 marital 45211 non-null object 5 job_edu 45211 non-null object 6 targeted_before_or_not 45211 non-null object 7 default 45211 non-null object 8 housing_loan 45211 non-null object 9 loan 45211 non-null object 10 contact 45211 non-null object 11 durationof_call 45211 non-null object 12 campaign 45211 non-null int32 13 pdays 45211 non-null int32 14 previous 45211 non-null int32 15 prv_outcome 45211 non-null object 16 response_after_call 45211 non-null object 17 Con_Date 45211 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(2), int32(5), object(10) memory usage: 5.3+ MB
bank_df.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | Con_Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 58.0 | 100000.0 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 261 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 1 | 2 | 44.0 | 60000.0 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 151 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 2 | 3 | 33.0 | 120000.0 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 76 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 3 | 4 | 47.0 | 20000.0 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 92 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 4 | 5 | 33.0 | 0.0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 198 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
bank_df.describe()
| customer_id | customer_age | salary | balance | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 |
| mean | 22606.000000 | 40.935651 | 57084.453698 | 1362.272058 | 2.763841 | 40.197828 | 0.580323 |
| std | 13051.435847 | 10.616848 | 32016.099611 | 3044.765829 | 3.098021 | 100.128746 | 2.303441 |
| min | 1.000000 | 18.000000 | 0.000000 | -8019.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 11303.500000 | 33.000000 | 20000.000000 | 72.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 22606.000000 | 39.000000 | 60000.000000 | 448.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 33908.500000 | 48.000000 | 70000.000000 | 1428.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 45211.000000 | 95.000000 | 120000.000000 | 102127.000000 | 63.000000 | 871.000000 | 275.000000 |
plt.figure(figsize=(12,5))
plt.hist(bank_df.customer_age,edgecolor = 'b', color= 'orange');
1)We can see that maximum count of customer of age who have account in bank is bitween 25 to 40.
2)After 55 age of customers,there is very minimum no of account.
plt.figure(figsize=(12,5))
plt.hist(bank_df.salary,edgecolor = 'b', color= 'orange');
Observation= Maximum no of salary of customers are 10k-20k, 50k-70k and 1lakh-1.20lakh
plt.figure(figsize=(12,5))
sns.boxplot(data=bank_df,x='salary');
observation = Range of salary is 20k to 70k and there is no outliers in salary and median of salary is 60k.
plt.figure(figsize=(12,5))
sns.boxplot(data=bank_df,x='balance');
observation = Maximum customers save there salary bitween 0 to 1500 rs and there are some customers who save large amount of there salary.
plt.figure(figsize=(12,5))
sns.boxplot(data=bank_df,x='customer_age');
observation= Age of customers bitween 32 to 47 have maximum no of account in this bank
plt.figure(figsize=(12,5))
sns.boxplot(data=bank_df,x='campaign');
#starting point
bank_df1 = pd.read_csv(r'C:\Data\Banking_cleaned_Data.csv')
from datetime import datetime
bank_df1['Con_Date'] = pd.to_datetime(bank_df1.Con_Date)
bank_df1.drop('Unnamed: 0',axis=1,inplace=True)
bank_df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45211 entries, 0 to 45210 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_id 45211 non-null int64 1 customer_age 45211 non-null float64 2 salary 45211 non-null float64 3 balance 45211 non-null int64 4 marital 45211 non-null object 5 job_edu 45211 non-null object 6 targeted_before_or_not 45211 non-null object 7 default 45211 non-null object 8 housing_loan 45211 non-null object 9 loan 45211 non-null object 10 contact 45211 non-null object 11 durationof_call 45211 non-null object 12 campaign 45211 non-null int64 13 pdays 45211 non-null int64 14 previous 45211 non-null int64 15 prv_outcome 45211 non-null object 16 response_after_call 45211 non-null object 17 Con_Date 45211 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(2), int64(5), object(10) memory usage: 6.2+ MB
bank_df1.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | Con_Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 58.0 | 100000.0 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 261 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 1 | 2 | 44.0 | 60000.0 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 151 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 2 | 3 | 33.0 | 120000.0 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 76 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 3 | 4 | 47.0 | 20000.0 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 92 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
| 4 | 5 | 33.0 | 0.0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 198 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 |
plt.figure(figsize=(12,5))
sns.countplot(data=bank_df1,x='loan');
Maximum customer do use to take loan from bank apprx 7000 taken loan from bank out of 35000
plt.figure(figsize=(12,5))
sns.countplot(data=bank_df1,x='housing_loan');
Maximum customer do use to take loan from bank apprx 7000 taken loan from bank out of 35000
plt.figure(figsize=(12,5))
sns.countplot(data=bank_df1,x='housing_loan');
obseravation =1) 25000 customer take house loan out of 45211 customers 2) 20000 cutomer do not take house loan
plt.figure(figsize=(12,5))
sns.countplot(data=bank_df1,x='marital');
observation = 1)28k customers are maried
2)13k customers are single
3)5k customers are divorced
plt.figure(figsize=(12,5))
sns.countplot(data=bank_df1,x='response_after_call');
only 8% customer responed after call out of 100%
sns.scatterplot(data =bank_df1, x = 'balance', y= 'customer_age');
In scatter plot we can see that maximum peopal save there money is below 20000 and the age of customer between 30 to 60 saves more money.
sns.scatterplot(data = bank_df1, x = 'salary', y= 'customer_age');
sns.pairplot(bank_df1);
bank_df1.plot.hexbin(x='salary',
y='customer_age',
reduce_C_function=np.sum,
gridsize=10)
<AxesSubplot:xlabel='salary', ylabel='customer_age'>
sns.barplot(data = bank_df1, x = 'marital', y = 'salary', ci = None, palette='bright');
maximum no. of divorced peopal have salary above 60,000
sns.barplot(data = bank_df1, x = 'marital', y = 'balance', ci = None, palette='bright');
Maximum no of maried peopal save there salary.
sns.barplot(data = bank_df1, x = 'housing_loan', y = 'salary', ci = None, palette='bright');
plt.figure(dpi = 200)
sns.displot(data = bank_df1, x = 'salary', height = 5, aspect = 15/5,kde = True);
<Figure size 1200x800 with 0 Axes>
maximum no of customer's earning salary are 20k, 60k and 1lakh.
plt.figure(dpi = 200)
sns.displot(data = bank_df1, x = 'balance', height = 5, aspect = 15/5,kde = True);
<Figure size 1200x800 with 0 Axes>
approx 2000 customers save there salary up to 1500
sns.heatmap(bank_df1.corr(), cmap = 'rainbow', annot=True);
bank_df1.corr()
| customer_id | customer_age | salary | balance | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| customer_id | 1.000000 | 0.014902 | 0.040000 | 0.073639 | -0.102884 | 0.437729 | 0.271098 |
| customer_age | 0.014902 | 1.000000 | 0.026233 | 0.097744 | 0.004851 | -0.023765 | 0.001329 |
| salary | 0.040000 | 0.026233 | 1.000000 | 0.056322 | 0.015300 | -0.015560 | 0.014186 |
| balance | 0.073639 | 0.097744 | 0.056322 | 1.000000 | -0.014578 | 0.003435 | 0.016674 |
| campaign | -0.102884 | 0.004851 | 0.015300 | -0.014578 | 1.000000 | -0.088628 | -0.032855 |
| pdays | 0.437729 | -0.023765 | -0.015560 | 0.003435 | -0.088628 | 1.000000 | 0.454820 |
| previous | 0.271098 | 0.001329 | 0.014186 | 0.016674 | -0.032855 | 0.454820 | 1.000000 |
Observation= 1)Costomer id is directly proportional to previous day 2)Maximum column are not proportonal to each other, show almost 0 relation with each other
bank_df1.describe()
| customer_id | customer_age | salary | balance | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 |
| mean | 22606.000000 | 40.935651 | 57084.453698 | 1362.272058 | 2.763841 | 40.197828 | 0.580323 |
| std | 13051.435847 | 10.616848 | 32016.099611 | 3044.765829 | 3.098021 | 100.128746 | 2.303441 |
| min | 1.000000 | 18.000000 | 0.000000 | -8019.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 11303.500000 | 33.000000 | 20000.000000 | 72.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 22606.000000 | 39.000000 | 60000.000000 | 448.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 33908.500000 | 48.000000 | 70000.000000 | 1428.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 45211.000000 | 95.000000 | 120000.000000 | 102127.000000 | 63.000000 | 871.000000 | 275.000000 |
bank_df1.mean()
bank_df1.median()
customer_id 22606.000000 customer_age 40.935651 salary 57084.453698 balance 1362.272058 campaign 2.763841 pdays 40.197828 previous 0.580323 dtype: float64
customer_id 22606.0 customer_age 39.0 salary 60000.0 balance 448.0 campaign 2.0 pdays -1.0 previous 0.0 dtype: float64
Observation= In customer's balance and previous day column there is a large difference b/n mean and meadian,it means this column consist maximum no. of outliears
# salary
sns.displot(data=bank_df1,x='salary',kde=True)
<seaborn.axisgrid.FacetGrid at 0x1c9d2e12c70>
np.mean(bank_df1.salary)
np.std(bank_df1.salary)
np.mean(bank_df1.salary)+np.std(bank_df1.salary)
np.mean(bank_df1.salary)-np.std(bank_df1.salary)
57084.453697756246
32015.7455352698
89100.19923302604
25068.708162486444
bank_df1.salary.skew()
0.13913623752811108
np.log(bank_df1.salary).skew()
nan
sns.displot(np.log(bank_df1.salary), kde = True);
# Age
sns.displot(data=bank_df1,x='customer_age',kde=True)
<seaborn.axisgrid.FacetGrid at 0x1c9ce15ffd0>
bank_df1.customer_age.skew()
0.6851529717199948
np.log(bank_df1.customer_age).skew()
0.09882818718129278
sns.displot(np.log(bank_df1.customer_age), kde = True);
sns.displot(data=bank_df1,x='balance',kde=True)
<seaborn.axisgrid.FacetGrid at 0x1c9d488abb0>
bank_df1.balance.skew()
8.360308326166326
np.log(bank_df1.balance).skew()
nan
sns.displot(np.log(bank_df1.balance),kde=True)
<seaborn.axisgrid.FacetGrid at 0x1c9d2c0e970>
Now its looking normal distribution bell like curve
Trying to find relation between Job and house loan
# Fixing job column
bank_df1['Job'] = bank_df1.job_edu.apply(lambda x: x.split(',')[0])
bank_df1['Education'] = bank_df1.job_edu.apply(lambda x: x.split(',')[1])
bank_df1.head()
| customer_id | customer_age | salary | balance | marital | job_edu | targeted_before_or_not | default | housing_loan | loan | contact | durationof_call | campaign | pdays | previous | prv_outcome | response_after_call | Con_Date | Job | Education | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 58.0 | 100000.0 | 2143 | married | management,tertiary | yes | no | yes | no | unknown | 261 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 | management | tertiary |
| 1 | 2 | 44.0 | 60000.0 | 29 | single | technician,secondary | yes | no | yes | no | unknown | 151 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 | technician | secondary |
| 2 | 3 | 33.0 | 120000.0 | 2 | married | entrepreneur,secondary | yes | no | yes | yes | unknown | 76 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 | entrepreneur | secondary |
| 3 | 4 | 47.0 | 20000.0 | 1506 | married | blue-collar,unknown | no | no | yes | no | unknown | 92 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 | blue-collar | unknown |
| 4 | 5 | 33.0 | 0.0 | 1 | single | unknown,unknown | no | no | no | no | unknown | 198 sec | 1 | -1 | 0 | unknown | no | 2017-05-05 | unknown | unknown |
import scipy.stats as stats
data_crosstab = pd.crosstab(bank_df1.Job,bank_df1.housing_loan,margins=True,margins_name='Total')
data_crosstab
| housing_loan | no | yes | Total |
|---|---|---|---|
| Job | |||
| admin. | 1989 | 3182 | 5171 |
| blue-collar | 2684 | 7048 | 9732 |
| entrepreneur | 618 | 869 | 1487 |
| housemaid | 842 | 398 | 1240 |
| management | 4780 | 4678 | 9458 |
| retired | 1773 | 491 | 2264 |
| self-employed | 814 | 765 | 1579 |
| services | 1388 | 2766 | 4154 |
| student | 689 | 249 | 938 |
| technician | 3482 | 4115 | 7597 |
| unemployed | 760 | 543 | 1303 |
| unknown | 262 | 26 | 288 |
| Total | 20081 | 25130 | 45211 |
# significance level
alpha = 0.05
rows = bank_df1.Job.unique()
columns = bank_df1.housing_loan.unique()
print(rows, columns)
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown' 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid' 'student'] ['yes' 'no']
# Finding chi square value
chi_square = []
for i in columns:
for j in rows:
O = data_crosstab[i][j]
E = data_crosstab[i]['Total'] * data_crosstab['Total'][j] / data_crosstab['Total']['Total']
chi_square.append((O-E)**2/E)
chi_square
[63.794863997804036, 2.7470234319781315, 2.182112082553112, 496.3480499077622, 112.30424784699052, 467.99235587860153, 32.95365538421202, 90.47094588179588, 14.463534136876882, 45.36255439859088, 123.06371715292835, 142.29418193919253, 79.8349152066538, 3.4377122078387754, 2.7307642365698777, 621.1456846861244, 140.54109598102045, 585.6604702569223, 41.23924903168408, 113.21820975098515, 18.100125136184253, 56.768138640336076, 154.00583696295448, 178.07145023315113]
chi_square = np.sum(chi_square)
print(chi_square)
3588.7308943697108
# The critical value approach
print("\n--------------------------------------------------------------------------------------")
print(" The critical value approach to hypothesis testing in the decision rule")
critical_value = stats.chi2.ppf(1-alpha, (len(rows)-1)*(len(columns)-1))
if chi_square > critical_value:
conclusion = "Null Hypothesis is rejected."
else:
conclusion = "Failed to reject the null hypothesis."
print("chisquare-score is:", chi_square, " and critical value is:", critical_value)
print(conclusion)
-------------------------------------------------------------------------------------- The critical value approach to hypothesis testing in the decision rule chisquare-score is: 3588.7308943697108 and critical value is: 19.67513757268249 Null Hypothesis is rejected.
# Making contingency table JOb with respect to response after call
contigency_table = pd.crosstab(bank_df1.Job,bank_df1.response_after_call,margins=True,margins_name='Total')
contigency_table
| response_after_call | no | yes | Total |
|---|---|---|---|
| Job | |||
| admin. | 4540 | 631 | 5171 |
| blue-collar | 9024 | 708 | 9732 |
| entrepreneur | 1363 | 124 | 1487 |
| housemaid | 1131 | 109 | 1240 |
| management | 8157 | 1301 | 9458 |
| retired | 1747 | 517 | 2264 |
| self-employed | 1392 | 187 | 1579 |
| services | 3784 | 370 | 4154 |
| student | 669 | 269 | 938 |
| technician | 6757 | 840 | 7597 |
| unemployed | 1101 | 202 | 1303 |
| unknown | 254 | 34 | 288 |
| Total | 39919 | 5292 | 45211 |
p_No = 39919/45211
p_yes= 5292/45211
p_adm = 5171/45211
p_bc=9732/45211
p_ent = 1487/45211
p_hm = 1240/45211
p_manag = 9458/45211
p_ret = 2264/45211
p_semp = 1579/4511
p_ser = 4154/45211
p_std = 937/45211
p_tech = 7597/45211
p_unempl = 1303/45211
p_unkno = 288/45211
p_adm
p_bc
p_ent
p_hm
p_manag
p_No
p_ret
p_semp
p_ser
p_std
p_tech
p_unempl
p_unkno
p_yes
0.11437482028709828
0.21525734887527373
0.03289022582999712
0.027426953617482472
0.20919687686624935
0.882948839884099
0.05007630886288735
0.35003325205054314
0.09188029461856628
0.02072504478998474
0.16803432792904383
0.028820419809338436
0.006370131162770122
0.117051160115901
# Probability of Job with respect to resopnse of call
contigency_table/45211
| response_after_call | no | yes | Total |
|---|---|---|---|
| Job | |||
| admin. | 0.100418 | 0.013957 | 0.114375 |
| blue-collar | 0.199597 | 0.015660 | 0.215257 |
| entrepreneur | 0.030148 | 0.002743 | 0.032890 |
| housemaid | 0.025016 | 0.002411 | 0.027427 |
| management | 0.180421 | 0.028776 | 0.209197 |
| retired | 0.038641 | 0.011435 | 0.050076 |
| self-employed | 0.030789 | 0.004136 | 0.034925 |
| services | 0.083696 | 0.008184 | 0.091880 |
| student | 0.014797 | 0.005950 | 0.020747 |
| technician | 0.149455 | 0.018580 | 0.168034 |
| unemployed | 0.024352 | 0.004468 | 0.028820 |
| unknown | 0.005618 | 0.000752 | 0.006370 |
| Total | 0.882949 | 0.117051 | 1.000000 |
# Making contingency table JOb with respect to response after call
contigency_table = pd.crosstab(bank_df1.Education,bank_df1.response_after_call,margins=True,margins_name='Total')
contigency_table
| response_after_call | no | yes | Total |
|---|---|---|---|
| Education | |||
| primary | 6260 | 591 | 6851 |
| secondary | 20750 | 2452 | 23202 |
| tertiary | 11304 | 1997 | 13301 |
| unknown | 1605 | 252 | 1857 |
| Total | 39919 | 5292 | 45211 |
# Probability
contigency_table/45211
| response_after_call | no | yes | Total |
|---|---|---|---|
| Education | |||
| primary | 0.138462 | 0.013072 | 0.151534 |
| secondary | 0.458959 | 0.054235 | 0.513194 |
| tertiary | 0.250028 | 0.044171 | 0.294198 |
| unknown | 0.035500 | 0.005574 | 0.041074 |
| Total | 0.882949 | 0.117051 | 1.000000 |
plt.figure(figsize=(10,6),dpi=200)
plt.subplot(321)
sns.boxplot(data=bank_df1,x = 'salary')
plt.subplot(322)
sns.boxplot(data=bank_df1,x = 'balance')
plt.subplot(323)
sns.barplot(data = bank_df1, x = 'marital', y = 'salary', ci = None, palette='bright');
plt.subplot(324)
sns.barplot(data = bank_df1, x = 'marital', y = 'balance', ci = None, palette='bright');
plt.subplot(325)
sns.scatterplot(data =bank_df1, x = 'balance', y= 'customer_age');
plt.subplot(326)
sns.scatterplot(data =bank_df1, x = 'salary', y= 'customer_age');
# plt.tight_layout();
observation = In bar plot we can see that average salary of married customers are 57k and saving is more for single and divorced people